Environmental Data Analysis

library(mongolstats)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(plotly)

nso_options(mongolstats.lang = "en")

Overview

Air quality is a primary environmental determinant of health in Mongolia. The NSO’s network of monitoring stations provides high-resolution data essential for exposure assessment. Unlike aggregated regional data, these station-level measurements allow for localized exposure assessment—critical for environmental health research.

This guide demonstrates how to: - Analyze air quality trends across monitoring stations - Compare pollutant levels against regulatory standards - Map water quality across river basins - Track climate-related health risks (dust exposure)

Air Quality Monitoring

Pollutant Concentrations by Station

The NSO maintains air quality monitoring stations that measure key pollutants. We’ll focus on the Top 10 most polluted stations in Ulaanbaatar to understand the severity of exposure.

# 1. Identify Top 10 Polluted Stations in UB (using 2024 annual data)
air_annual <- nso_data(
  "DT_NSO_2024_135V01",
  selections = list(Year = "2024"),
  labels = "en"
) |>
  filter(!is.na(value)) |>
  mutate(
    Station = str_trim(`Station location_en`),
    Pollutant = str_trim(Indicator_en)
  )

# Define UB Stations
ub_stations <- c(
  "Misheel-Expo center", "West crossroad", "1st micro district", 
  "13th micro district", "32nd Toirog", "Ofitseruudiin ordon", 
  "Kharkhorin market", "Urgakh naran micro district", "Dambdarjaa", 
  "Khailaast", "Tolgoit", "Zuragt", "Amgalan", "Nisekh", 
  "Tavan buudal", "Bayankhoshuu", "Sharkhad", "100 ail"
)

# Rank UB stations by PM2.5 levels
top_10_stations <- air_annual |>
  filter(Station %in% ub_stations) |>
  filter(str_detect(Pollutant, "PM2.5")) |>
  arrange(desc(value)) |>
  head(10) |>
  pull(Station)

# 2. Fetch Monthly Data for Detailed Trends (2023-2025)
air_monthly <- nso_data(
  "DT_NSO_2400_015V2",
  selections = list(
    Year = as.character(2023:2025)
  ),
  labels = "en"
) |>
  filter(!is.na(value)) |>
  mutate(
    Station = str_trim(Location_en),
    Pollutant_Raw = str_trim(`Indicator of air pollution_en`),
    Date = as.Date(paste0(Month_en, "-01"))
  ) |>
  # Explicit date filter to ensure only 2023-2025 data (Year selection may not work)
  filter(Date >= as.Date("2021-01-01"), Date <= as.Date("2025-12-31")) |>
  filter(Station %in% top_10_stations)


# Clean and Reshape Data for Ribbons
# We need Average, Min, and Max for PM2.5 and SO2
air_trends <- air_monthly |>
  mutate(
    Pollutant_Type = case_when(
      str_detect(Pollutant_Raw, "PM2.5") ~ "PM2.5",
      str_detect(Pollutant_Raw, "Sulphur dioxide") ~ "SO2",
      TRUE ~ NA_character_
    ),
    Measure = case_when(
      str_detect(Pollutant_Raw, "Average") ~ "Average",
      str_detect(Pollutant_Raw, "Maximum") ~ "Maximum",
      str_detect(Pollutant_Raw, "Minimum") ~ "Minimum", 
      TRUE ~ "Other"
    )
  ) |>
  filter(!is.na(Pollutant_Type), Measure %in% c("Average", "Maximum")) |>
  # Use Average for the trend line and Maximum for the upper ribbon bound
  # Outlier Removal: Group by Station and Pollutant to detect anomalies
  # Using IQR method (3 * IQR) to remove extreme outliers while keeping natural spikes
  group_by(Station, Pollutant_Type, Measure) |>
  mutate(
    Q1 = quantile(value, 0.25, na.rm = TRUE),
    Q3 = quantile(value, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower = Q1 - 3 * IQR,
    Upper = Q3 + 3 * IQR,
    Is_Outlier = value < Lower | value > Upper
  ) |>
  filter(!Is_Outlier) |>
  ungroup() |>
  select(Station, Date, Pollutant_Type, Measure, value) |>
  pivot_wider(names_from = Measure, values_from = value)

# Note: If "Minimum" isn't explicitly available, we'll just show Average and Max range

Regulatory Compliance: Exceedance Factor

The Exceedance Factor quantifies how many times a station’s annual average exceeds the regulatory limit. A factor of 1.0 represents the threshold; values >1.0 indicate non-compliance.

To clearly see how much stations exceed safety limits, we calculate the Exceedance Factor (Ratio of Annual Average to MAC). A value > 1.0 indicates non-compliance.

# 1. Get MAC Standards
mac_so2 <- 0.020 # Annual average mg/m3
mac_no2 <- 0.030 # Annual average mg/m3
mac_pm25 <- 0.025 # Annual average mg/m3 (WHO/National target approximation)

# 2. Calculate Exceedance for 2024
compliance <- air_annual |>
  filter(Station %in% top_10_stations) |>
  mutate(
    Pollutant_Type = case_when(
      str_detect(Pollutant, "PM2.5") ~ "PM2.5",
      str_detect(Pollutant, "Sulphur dioxide") ~ "SO2",
      str_detect(Pollutant, "Nitrogen dioxide") ~ "NO2",
      TRUE ~ NA_character_
    ),
    MAC = case_when(
      Pollutant_Type == "PM2.5" ~ mac_pm25,
      Pollutant_Type == "SO2" ~ mac_so2,
      Pollutant_Type == "NO2" ~ mac_no2,
      TRUE ~ NA_real_
    )
  ) |>
  filter(!is.na(Pollutant_Type)) |>
  mutate(
    Exceedance_Factor = value / MAC,
    Status = ifelse(Exceedance_Factor > 1, "Non-Compliant", "Compliant")
  )

# 3. Plot
p_comp <- compliance |>
  ggplot(aes(x = reorder(Station, Exceedance_Factor), y = Exceedance_Factor, fill = Status,
             text = paste0("<b>", Station, "</b><br>",
                           "Pollutant: ", Pollutant_Type, "<br>",
                           "Factor: ", round(Exceedance_Factor, 1), "x Limit"))) +
  geom_col() +
  geom_hline(yintercept = 1, linetype = "dashed", color = "black", linewidth = 1) +
  facet_wrap(~Pollutant_Type, scales = "free_x") +
  coord_flip() +
  scale_fill_manual(values = c("Compliant" = "#27ae60", "Non-Compliant" = "#c0392b")) +
  labs(
    title = "Regulatory Compliance: Exceedance Factors (2024)",
    subtitle = "Ratio of Annual Average to Maximum Allowable Concentration (MAC)",
    y = "Times above Safety Limit",
    x = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.major.y = element_blank()
  )

plotly::ggplotly(p_comp, tooltip = "text")

Compliance Crisis: 90% of top stations are non-compliant for PM2.5, with exceedance factors reaching 2.0-2.5x (equivalent to annual averages of 0.050-0.063 mg/m³). Only the 13th micro district achieves compliance for PM2.5, suggesting localized factors (wind patterns, lower housing density, or cleaner fuel access) warrant investigation for replication elsewhere.

SO₂ non-compliance is universal across all 10 stations, with factors of 2-4x, underscoring the pervasive reliance on raw coal for heating.

Water Quality Monitoring

Electrical conductivity serves as a proxy for total dissolved solids and water mineralization. Values >1,000 µS/cm typically indicate significant mineral content; values >10,000 µS/cm suggest either natural saline conditions (endorheic basins) or severe anthropogenic pollution.

Water quality is monitored at specific river and lake stations across Mongolia’s major basins. We focus on the Top 15 stations with the highest electrical conductivity.

# Fetch water quality data
water <- nso_data(
  "DT_NSO_2300_005V12",
  selections = list(),
  labels = "en"
) |>
  filter(!is.na(value)) |>
  mutate(
    Station = str_trim(`Station location_en`),
    Indicator = str_trim(Indicator_en)
  )

# Focus on electrical conductivity & Top 15
conductivity <- water |>
  filter(str_detect(Indicator, "Electrical conductivity")) |>
  slice_max(order_by = value, n = 15)

# Plot
p <- conductivity |>
  mutate(Station = reorder(Station, value)) |>
  ggplot(aes(x = Station, y = value, fill = value,
             text = paste0("<b>", Station, "</b><br>",
                           "Conductivity: ", scales::comma(value), " μS/cm"))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_viridis_c(option = "viridis") +
  labs(
    title = "Top 15 Water Stations by Conductivity (2023)",
    subtitle = "Higher values may indicate increased mineralization or pollution",
    x = NULL,
    y = "Conductivity (μS/cm)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )

plotly::ggplotly(p, tooltip = "text")

Geographic Clustering: The top 2 stations (Taigam/Delger and Uvs/Davst) show extreme conductivity (~20,000 µS/cm), likely reflecting natural saline lake/wetland systems in western Mongolia. In contrast, Khyargas shows high conductivity (~10,500 µS/cm), while other river monitoring stations (e.g., Ganga) show mid-range values (~5,000 µS/cm), which may indicate mining runoff, agricultural inputs, or urban wastewater contamination. Baseline salinity data and upstream land-use analysis are needed for definitive source attribution.

Climate Exposure: Dust Days

Dust storms pose respiratory health risks through coarse particulate (PM10) inhalation and serve as a climate change indicator. Desertification and reduced soil moisture amplify dust mobilization in Mongolia’s Gobi region.

Dust storms are a significant environmental health concern. Here are the Top 10 stations with the highest number of dust days in 2024.

# Fetch dust day data
dust <- nso_data(
  "DT_NSO_2400_028V1",
  selections = list(),
  labels = "en"
) |>
  filter(!is.na(value)) |>
  mutate(
    Station = str_trim(`Station location_en`),
    Month_date = Month_en
  )

# Parse dates and aggregate to annual
dust_annual <- dust |>
  mutate(
    Year = as.numeric(str_sub(Month_date, 1, 4))
  ) |>
  filter(Year == 2024) |>
  group_by(Station) |>
  summarize(
    Total_dust_days = sum(value, na.rm = TRUE),
    .groups = "drop"
  ) |>
  slice_max(Total_dust_days, n = 10)

p <- dust_annual |>
  mutate(Station = reorder(Station, Total_dust_days)) |>
  ggplot(aes(x = Station, y = Total_dust_days, fill = Total_dust_days,
             text = paste0("<b>", Station, "</b><br>",
                           "Dust Days: ", Total_dust_days))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis_c(option = "magma", direction = -1) +
  labs(
    title = "Top 10 Stations for Dust Exposure (2024)",
    subtitle = "Total annual dust days",
    x = NULL,
    y = "Days",
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank()
  )

plotly::ggplotly(p, tooltip = "text")

Gobi Dust Belt: Sainshand (30 days/year) and Dalanzadgad (21 days/year) represent the core of Mongolia’s dust exposure zone. Chronic dust exposure at this frequency (equivalent to ~6-8% of the year) contributes to elevated respiratory disease burden. Climate projections suggest increasing dust frequency due to reduced precipitation and vegetation loss, particularly in the Gobi-steppe ecotone.

Geographic Context

For spatial analysis, station-level environmental data ideally should be joined with geographic coordinates. While the NSO data provides station names, a separate lookup table with lat/lon coordinates would enable:

Example workflow:

# Hypothetical station metadata table (user would compile this)
station_metadata <- tribble(
  ~Station_name, ~Latitude, ~Longitude, ~Aimag,
  "Ulaanbaatar", 47.9221, 106.9155, "Ulaanbaatar",
  "Darkhan", 49.4869, 105.9228, "Darkhan-Uul",
  "Erdenet", 49.0333, 104.0833, "Orkhon"
  # ... add all stations
)

# Join with environmental data
air_with_location <- air_quality |>
  left_join(station_metadata, by = c("Station" = "Station_name"))

# Now you can create maps, calculate distances, etc.

Next Steps

Key Environmental Tables

Indicator Table_ID Geographic_Level
Air Pollutant Concentrations DT_NSO_2024_135V01 Station
Maximum Allowable Concentrations DT_NSO_2400_015V1 Standard
Water Quality DT_NSO_2300_005V12 Station
Dust Days DT_NSO_2400_028V1 Station
Forest Fires DT_NSO_2400_005V1 Aimag